# Copyright 2019 by Michiel de Hoon.  All rights reserved.
# Based on code contributed and copyright 2016 by Peter Cock.
#
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
"""Bio.SeqIO support for the UCSC nib file format.

Nib stands for nibble (4 bit) representation of nucleotide sequences.
The two nibbles in a byte each store one nucleotide, represented numerically
as follows:

    0 - T
    1 - C
    2 - A
    3 - G
    4 - N (unknown)

A nib file contains only one sequence record.
You are expected to use this module via the Bio.SeqIO functions under
the format name "nib":

    >>> from Bio import SeqIO
    >>> record = SeqIO.read("Nib/test_bigendian.nib", "nib")
    >>> print("%i %s..." % (len(record), record.seq[:20]))
    37 ACGTAAACCGTACCCGTANA...

Notice that the sequence is given in upper case; unknown nucleotides are
written as N.

For detailed information on the file format, please see the UCSC
description at https://genome.ucsc.edu/FAQ/FAQformat.html.
"""

from __future__ import print_function

from Bio.SeqIO.Interfaces import SequenceWriter
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import struct
import sys

# hex2bytes converts a string of hexadecimal digits into the bytes it encodes.
if hasattr(bytes, "fromhex"):
    # Python 3: the bytes type provides the conversion directly
    hex2bytes = bytes.fromhex
else:
    # Python 2: bytes is str, which has no fromhex; use the 'hex' codec
    def hex2bytes(s):
        """Return the byte string encoded by the hexadecimal digits in s."""
        return s.decode('hex')

# bytes2hex converts a bytes object into a text string of hexadecimal digits
# (two digits per byte).
if sys.version_info >= (3, 5):
    # Python 3.5 and later: bytes objects have a hex() method
    def bytes2hex(b):
        """Return the hexadecimal representation of the bytes object b."""
        return b.hex()
elif sys.version_info >= (3, ):
    # Python 3.0 - 3.4: hexlify returns bytes, so decode the result to text
    import binascii

    def bytes2hex(b):
        """Return the hexadecimal representation of the bytes object b."""
        return binascii.hexlify(b).decode('ascii')
else:
    # Python 2: hexlify already returns a native string
    import binascii
    bytes2hex = binascii.hexlify

# byte2int interprets a bytes object as an integer, given the byte order
# ('little' or 'big').
if hasattr(int, "from_bytes"):
    # Python 3
    def byte2int(b, byteorder):
        """Return the integer stored in b using the given byte order."""
        return int.from_bytes(b, byteorder)
else:
    # Python 2: int.from_bytes is not available; unpack a 4-byte integer
    def byte2int(b, byteorder):
        """Return the 4-byte integer stored in b using the given byte order."""
        if byteorder == 'little':
            fmt = "<i"
        elif byteorder == 'big':
            fmt = ">i"
        else:
            return None
        return struct.unpack(fmt, b)[0]


# maketrans builds a translation table usable with the translate method of
# native strings.
maketrans = getattr(str, "maketrans", None)  # available on Python 3
if maketrans is None:
    # Python 2: the function lives in the string module instead
    import string
    maketrans = string.maketrans


# This is a generator function!
def NibIterator(handle, alphabet=None):
    """Iterate over a nib file and yield a SeqRecord.

        - handle - input file in the nib file format as defined by UCSC.
          This must be opened in binary mode!
        - alphabet - not used; must be None (a ValueError is raised
          otherwise).

    Note that a nib file always contains only one sequence record.
    The sequence of the resulting SeqRecord object should match the sequence
    generated by Jim Kent's nibFrag utility, except that it will be in upper
    case, whereas nibFrag uses lower case.

    A ValueError is raised if the file signature is not recognized, if the
    amount of data in the file does not agree with the sequence length
    stored in the header, or if the data contain nibble values other than
    0, 1, 2, 3, or 4.

    This function is used internally via the Bio.SeqIO functions:

    >>> from Bio import SeqIO
    >>> record = SeqIO.read("Nib/test_bigendian.nib", "nib")
    >>> print("%s %i" % (record.seq, len(record)))
    ACGTAAACCGTACCCGTANANCANNNNACNANNANCN 37

    You can also call it directly:

    >>> with open("Nib/test_bigendian.nib", "rb") as handle:
    ...     for record in NibIterator(handle):
    ...         print("%s %i" % (record.seq, len(record)))
    ...
    ACGTAAACCGTACCCGTANANCANNNNACNANNANCN 37

    """
    if alphabet is not None:
        raise ValueError("Alphabets are ignored.")
    # The first four bytes hold a signature whose byte layout reveals the
    # byte order used by the machine that wrote the file.
    word = handle.read(4)
    signature = bytes2hex(word)
    if signature == '3a3de96b':
        byteorder = 'little'  # little-endian
    elif signature == '6be93d3a':
        byteorder = 'big'  # big-endian
    else:
        raise ValueError('unexpected signature in Nib header')
    # The next four bytes hold the number of nucleotides in the sequence.
    number = handle.read(4)
    if len(number) != 4:
        # Truncated header; do not interpret a partial length field.
        raise ValueError('Unexpected file size')
    length = byte2int(number, byteorder)
    if length < 0:
        # A negative nucleotide count can never match the data that follow.
        raise ValueError('Unexpected file size')
    data = handle.read()
    # Each byte stores two nucleotides, so each hexadecimal digit of the
    # data corresponds to exactly one nucleotide.
    indices = bytes2hex(data)
    # Sequences of odd length are padded with one extra nibble to fill the
    # final byte.
    if len(indices) != length + length % 2:
        raise ValueError('Unexpected file size')
    indices = indices[:length]  # drop the padding nibble, if any
    # Every nibble must be one of the five known codes; a valid sequence
    # need not use all of them (it may, for example, contain no N at all).
    if not set(indices).issubset('01234'):
        raise ValueError('Unexpected sequence data found in file')
    table = maketrans('01234', 'TCAGN')
    nucleotides = indices.translate(table)
    sequence = Seq(nucleotides)
    record = SeqRecord(sequence)
    yield record


class NibWriter(SequenceWriter):
    """Nib file writer."""

    def __init__(self, handle):
        """Initialize an Nib writer object.

        The file signature is written to the handle immediately, using the
        byte order of the machine running the code.

        Arguments:
         - handle - Output handle, in binary write mode.
        """
        self.handle = handle
        byteorder = sys.byteorder
        if byteorder == 'little':  # little-endian
            signature = '3a3de96b'
        elif byteorder == 'big':  # big-endian
            signature = '6be93d3a'
        else:
            raise RuntimeError('unexpected system byte order %s' % byteorder)
        handle.write(hex2bytes(signature))

    def write_file(self, records):
        """Use this to write an entire file containing the given record.

        Arguments:
         - records - iterable that must provide exactly one record; the
           sequence is taken from its seq attribute.

        Returns the number of records written (always 1).

        Raises a ValueError if records provides no record or more than one
        record, or if the sequence contains letters other than
        A, C, G, T, N (in upper or lower case).
        """
        count = 0
        for record in records:
            count += 1
        if count == 0:
            raise ValueError("Must have one sequence")
        if count > 1:
            raise ValueError('More than one sequence found')
        handle = self.handle
        sequence = record.seq
        nucleotides = str(sequence)
        length = len(sequence)
        # Validate the letters themselves, and do so before anything is
        # written. Checking the translated string instead would accept the
        # digit characters '0' to '4' in the input, and requiring all five
        # symbols to be present would reject valid sequences such as "ACGT".
        if not set(nucleotides).issubset('ACGTNacgtn'):
            raise ValueError('Sequence should contain A,C,G,T,N,a,c,g,t,n only')
        # Native byte order, consistent with the signature written by
        # the constructor.
        handle.write(struct.pack('i', length))
        table = maketrans('TCAGNtcagn', '0123401234')
        # Two nucleotides are stored per byte; pad sequences of odd length
        # with one T (nibble 0) to fill the final byte.
        padding = length % 2
        suffix = padding * 'T'
        nucleotides += suffix
        indices = nucleotides.translate(table)
        handle.write(hex2bytes(indices))
        return count


if __name__ == "__main__":
    # When executed as a script, run this module's doctests quietly
    # via Biopython's run_doctest helper.
    from Bio._utils import run_doctest
    run_doctest(verbose=0)
